#!/usr/bin/python
# coding=utf-8
"""
自贡人才
"""

from pyquery import PyQuery as pq
from sqlalchemy import Column, String,Integer, create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import re
import urllib
import os

Base = declarative_base()

class User(Base):
    """ORM mapping for one crawled profile, stored in the ``user`` table."""

    __tablename__ = 'user'

    # Primary key; crawlOne assigns the profile id from the page URL here.
    id = Column(Integer, primary_key=True)

    # Personal details.
    name = Column(String)
    sex = Column(String(4))
    birthday = Column(String(16))
    height = Column(Integer)
    idcard = Column(String(18))

    # Education details.
    education = Column(String(8))
    school = Column(String)
    profession = Column(String)
    dty = Column(String)

    # Contact details.
    address = Column(String)
    tel = Column(String)
    phone = Column(String)
    english = Column(String)
    mail = Column(String)
    address1 = Column(String)

    year = Column(Integer)
    # Value of the photo <img> src attribute as found on the page.
    photo = Column(String)

    def __str__(self):
        """Return a short summary: id, name, sex, birthday and year."""
        summary = (self.id, self.name, self.sex, self.birthday, self.year)
        return u'(User: %s, %s, %s, %s, %s)' % summary

# SQLite database stored in the file people.db (relative path).
# Pass echo=True to create_engine to log the SQL it emits.
engine = create_engine('sqlite:///people.db')
# Create the tables declared on Base (the ``user`` table) if they are missing.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Module-wide session; crawlOne uses it for its lookups and commits.
session = Session()

def crawlOne(id):
    """Return the ``User`` for profile ``id``, crawling the site if needed.

    If a row with this id is already stored it is returned straight away,
    without any network request. Otherwise the profile page is fetched, its
    form fields are copied onto a new ``User``, the photo (if any) is
    downloaded through ``saveFile`` and the row is committed.

    Args:
        id: Numeric profile id, used both in the page URL and as primary key.

    Returns:
        The stored or newly created ``User``, or ``None`` when the page has
        no name field value (a message with the URL is printed in that case).
    """
    # Look in the database first so already-crawled ids cost no HTTP request.
    user = session.query(User).filter(User.id == id).one_or_none()
    if user:
        return user

    url = "http://www.zgrc114.com/graduate/EditView?id=" + str(id)
    doc = pq(url)

    # A page without a name is treated as "no such profile".
    name = doc("#mp_name").val()
    if not name:
        print("")
        print("没有名字: %s" % url)
        return None

    # Serialise the document once; three values below are only available
    # in the page source and are pulled out with regular expressions.
    html = doc.html()

    user = User()
    user.id = id
    user.name = name
    # NOTE(review): the group captures exactly one character after "value=";
    # presumably the value is a single character - confirm against the page.
    m = re.search(r"mp_sex\]\[value=([^\]])", html)
    if m:
        user.sex = m.group(1)
    user.birthday = doc("#mp_age").val()
    user.height = doc("#mp_height").val()
    user.idcard = doc("#mp_idcard").val()
    user.education = doc("#mp_education").val()
    user.school = doc("#mp_school").val()
    user.profession = doc("#mp_profession").val()
    m = re.search(r"#mp_dty'\).attr\('value','([^']+)", html)
    if m:
        user.dty = m.group(1)
    user.address = doc("#mp_address").val()
    user.tel = doc("#mp_tel").val()
    user.phone = doc("#mp_phone").val()
    user.english = doc("#mp_wydj").val()
    user.mail = doc("#mp_mail").val()
    user.address1 = doc("#mp_address1").val()
    m = re.search(r"#mp_bysj'\).attr\('value','([^']+)", html)
    if m:
        user.year = m.group(1)

    # The photo is taken from the second <img> on the page. Its src is None
    # when that element or attribute is absent; skip the download then so
    # the record itself is still saved.
    user.photo = doc("img").eq(1).attr.src
    if user.photo:
        saveFile(user.photo)

    session.add(user)
    session.commit()
    print("%s" % user)
    return user

def saveFile(url):
    """Download a photo from the site into the local ``photo/`` directory.

    Args:
        url: Site-relative path of the image, e.g.
            ``/public/upload/photo/<name>.jpeg``. It is appended to the
            site's host name to form the download URL.

    The file is saved under its base name (the text after the last "/").
    Nothing is downloaded, and ``None`` is returned, when ``url`` is empty or
    ``None`` or when it ends with "/" and therefore has no base name.
    """
    if not url:
        return
    filename = url.split("/")[-1]
    if not filename:
        # "photo/" alone is a directory, not a file we could write to.
        return
    urllib.urlretrieve("http://www.zgrc114.com" + url, "photo/" + filename)

# saveFile writes into photo/, so make sure the directory exists first.
if not os.path.exists("photo"):
    os.mkdir("photo")

# Crawl every profile id in the hard-coded range 69116..81708. This runs at
# module level, i.e. also when the file is imported.
for x in range(69116,81709):
    user = crawlOne(x)
    # Debugging aid (disabled): pause whenever a profile could not be read.
    # if not user:
    #     raw_input("continue?")
# Manual one-off calls kept for debugging (disabled):
# crawlOne(78582)
# saveFile('/public/upload/photo/<file>.jpeg')
